{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "from pathlib import Path\n",
    "import gzip\n",
    "import json\n",
    "\n",
    "import pydicom\n",
    "from pydicom._dicom_dict import DicomDictionary\n",
    "\n",
    "# we need the location of MIMIC-CXR 2.0.0\n",
    "# we use this to get cxr-record-list.csv.gz\n",
    "# NOTE(review): absolute, machine-specific path - presumably needs adjusting per environment\n",
    "mimic_cxr_path = Path('/db/mimic-cxr')\n",
    "\n",
    "# we also need dicom-metadata.csv.gz and dicom-metadata.json.gz;\n",
    "# these are generated by export_metadata.py in this folder."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to store sequences from the DICOM, we created a JSON. We will load in that JSON now."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 10 top-level attributes in the DICOM json.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'528434': [{'524544': 'C12',\n",
       "   '524546': 'CLP',\n",
       "   '524548': 'CHEST (PORTABLE AP)'}],\n",
       " '533016': [{'524544': 'T-D3000',\n",
       "   '524546': 'SNM3',\n",
       "   '524548': 'Chest',\n",
       "   '524549': 'DCMR',\n",
       "   '524550': '20020904',\n",
       "   '524559': '4031'}],\n",
       " '1179748': [{'524544': '113100',\n",
       "   '524546': 'DCM',\n",
       "   '524547': '20170914',\n",
       "   '524548': 'Basic Application Confidentiality Profile'},\n",
       "  {'524544': '113105',\n",
       "   '524546': 'DCM',\n",
       "   '524547': '20170914',\n",
       "   '524548': 'Clean Descriptors Option'},\n",
       "  {'524544': '113107',\n",
       "   '524546': 'DCM',\n",
       "   '524547': '20170914',\n",
       "   '524548': 'Retain Longitudinal Temporal Information Modified Dates Option'},\n",
       "  {'524544': '113101',\n",
       "   '524546': 'DCM',\n",
       "   '524547': '20170914',\n",
       "   '524548': 'Clean Pixel Data Option'},\n",
       "  {'524544': '113103',\n",
       "   '524546': 'DCM',\n",
       "   '524547': '20170914',\n",
       "   '524548': 'Clean Graphics Option'}],\n",
       " '5505568': [{'524544': 'R-10206',\n",
       "   '524546': 'SNM3',\n",
       "   '524548': 'antero-posterior',\n",
       "   '524549': 'DCMR',\n",
       "   '524550': '20040302',\n",
       "   '524559': '4010'}],\n",
       " '5506064': [{'524544': 'F-10440',\n",
       "   '524546': 'SNM3',\n",
       "   '524548': 'Erect',\n",
       "   '524549': 'DCMR',\n",
       "   '524550': '20020904',\n",
       "   '524559': '19'}],\n",
       " '7340033': [{}]}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load json\n",
    "with gzip.open('dicom-metadata.json.gz', 'r') as fp:\n",
    "    tmp = json.load(fp)\n",
    "\n",
    "dcm_metadata = dict()\n",
    "# convert from length list of 1 item dicts to single dict\n",
    "for d in tmp:\n",
    "    for k, v in d.items():\n",
    "        dcm_metadata[k] = v\n",
    "        \n",
    "del tmp\n",
    "\n",
    "# figure out how many unique top level meta-data fields in the json\n",
    "# also get a list of all the top level tags\n",
    "json_keys = [list(dcm_metadata[x].keys()) for x in dcm_metadata]\n",
    "json_keys = set([int(item) for sublist in json_keys for item in sublist])\n",
    "json_keys = list(json_keys)\n",
    "json_keys.sort()\n",
    "\n",
    "n_attrib = len(json_keys)\n",
    "print(f'There are {n_attrib} top-level attributes in the DICOM json.')\n",
    "\n",
    "# show an example\n",
    "dcm_metadata['000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are three very useful items in this sequence that we'd like to have in an easier form for all images: the procedure code sequence (`'528434'`), the coded view position (`'5505568'`), and the coded patient orientation (`'5506064'`). For convenience, we will pull the textual description of each (`'524548'`), rather than the ontology code itself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ProcedureCodeSequence_CodeMeaning</th>\n",
       "      <th>ViewCodeSequence_CodeMeaning</th>\n",
       "      <th>PatientOrientationCodeSequence_CodeMeaning</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>02aa804e-bde0afdd-112c0b34-7bc16630-4e384014</th>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>postero-anterior</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962</th>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>lateral</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab</th>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>postero-anterior</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c</th>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>lateral</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714</th>\n",
       "      <td>CHEST (PORTABLE AP)</td>\n",
       "      <td>antero-posterior</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             ProcedureCodeSequence_CodeMeaning  \\\n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                CHEST (PA AND LAT)   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                CHEST (PA AND LAT)   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                CHEST (PA AND LAT)   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                CHEST (PA AND LAT)   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714               CHEST (PORTABLE AP)   \n",
       "\n",
       "                                             ViewCodeSequence_CodeMeaning  \\\n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014             postero-anterior   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                      lateral   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab             postero-anterior   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                      lateral   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714             antero-posterior   \n",
       "\n",
       "                                             PatientOrientationCodeSequence_CodeMeaning  \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                                      Erect  \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                                      Erect  \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                                      Erect  \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                                      Erect  \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714                                       None  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols = ['528434', '5505568', '5506064']\n",
    "dcm_metadata_simple = {}\n",
    "for k, v in dcm_metadata.items():\n",
    "    dcm_metadata_simple[k] = [v[c][0]['524548']\n",
    "                              for c in cols\n",
    "                              if c in v and len(v[c])>0]\n",
    "dcm_metadata_simple = pd.DataFrame.from_dict(dcm_metadata_simple, orient='index')\n",
    "\n",
    "# convert columns to be human readable\n",
    "dcm_metadata_simple.columns = [DicomDictionary[int(c)][-1] + '_' + DicomDictionary[int('524548')][-1]  for c in cols]\n",
    "dcm_metadata_simple.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>524293</th>\n",
       "      <th>524296</th>\n",
       "      <th>524310</th>\n",
       "      <th>524312</th>\n",
       "      <th>524320</th>\n",
       "      <th>524321</th>\n",
       "      <th>524322</th>\n",
       "      <th>524323</th>\n",
       "      <th>524336</th>\n",
       "      <th>524337</th>\n",
       "      <th>...</th>\n",
       "      <th>1578288</th>\n",
       "      <th>1610546</th>\n",
       "      <th>4194912</th>\n",
       "      <th>1577040</th>\n",
       "      <th>1577236</th>\n",
       "      <th>1605968</th>\n",
       "      <th>4195073</th>\n",
       "      <th>4195086</th>\n",
       "      <th>4198403</th>\n",
       "      <th>1577328</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dicom_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>02aa804e-bde0afdd-112c0b34-7bc16630-4e384014</th>\n",
       "      <td>ISO_IR 100</td>\n",
       "      <td>['DERIVED', 'PRIMARY']</td>\n",
       "      <td>1.2.840.10008.5.1.4.1.1.1.1</td>\n",
       "      <td>2.25.3543748844510614920925352225862149680</td>\n",
       "      <td>21800506</td>\n",
       "      <td>21800506</td>\n",
       "      <td>21800506.0</td>\n",
       "      <td>21800506</td>\n",
       "      <td>213014.531</td>\n",
       "      <td>213026.750</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962</th>\n",
       "      <td>ISO_IR 100</td>\n",
       "      <td>['DERIVED', 'PRIMARY']</td>\n",
       "      <td>1.2.840.10008.5.1.4.1.1.1.1</td>\n",
       "      <td>2.25.30925724177439423411425919179398157560</td>\n",
       "      <td>21800506</td>\n",
       "      <td>21800506</td>\n",
       "      <td>21800506.0</td>\n",
       "      <td>21800506</td>\n",
       "      <td>213014.531</td>\n",
       "      <td>213133.484</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab</th>\n",
       "      <td>ISO_IR 100</td>\n",
       "      <td>['DERIVED', 'PRIMARY']</td>\n",
       "      <td>1.2.840.10008.5.1.4.1.1.1.1</td>\n",
       "      <td>2.25.56006540967197077610238991327864082702</td>\n",
       "      <td>21800626</td>\n",
       "      <td>21800626</td>\n",
       "      <td>21800626.0</td>\n",
       "      <td>21800626</td>\n",
       "      <td>165500.312</td>\n",
       "      <td>165512.437</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c</th>\n",
       "      <td>ISO_IR 100</td>\n",
       "      <td>['DERIVED', 'PRIMARY']</td>\n",
       "      <td>1.2.840.10008.5.1.4.1.1.1.1</td>\n",
       "      <td>2.25.298436961669509509569879822879236656638</td>\n",
       "      <td>21800626</td>\n",
       "      <td>21800626</td>\n",
       "      <td>21800626.0</td>\n",
       "      <td>21800626</td>\n",
       "      <td>165500.312</td>\n",
       "      <td>165558.968</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714</th>\n",
       "      <td>ISO_IR 100</td>\n",
       "      <td>['DERIVED', 'PRIMARY']</td>\n",
       "      <td>1.2.840.10008.5.1.4.1.1.1.1</td>\n",
       "      <td>2.25.139183506679367140539825912154983541585</td>\n",
       "      <td>21800723</td>\n",
       "      <td>21800723</td>\n",
       "      <td>21800723.0</td>\n",
       "      <td>21800723</td>\n",
       "      <td>80556.875</td>\n",
       "      <td>80714.500</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 150 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  524293  \\\n",
       "dicom_id                                                   \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  ISO_IR 100   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  ISO_IR 100   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  ISO_IR 100   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  ISO_IR 100   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  ISO_IR 100   \n",
       "\n",
       "                                                              524296  \\\n",
       "dicom_id                                                               \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  ['DERIVED', 'PRIMARY']   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  ['DERIVED', 'PRIMARY']   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  ['DERIVED', 'PRIMARY']   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  ['DERIVED', 'PRIMARY']   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  ['DERIVED', 'PRIMARY']   \n",
       "\n",
       "                                                                   524310  \\\n",
       "dicom_id                                                                    \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  1.2.840.10008.5.1.4.1.1.1.1   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  1.2.840.10008.5.1.4.1.1.1.1   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  1.2.840.10008.5.1.4.1.1.1.1   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  1.2.840.10008.5.1.4.1.1.1.1   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  1.2.840.10008.5.1.4.1.1.1.1   \n",
       "\n",
       "                                                                                    524312  \\\n",
       "dicom_id                                                                                     \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014    2.25.3543748844510614920925352225862149680   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962   2.25.30925724177439423411425919179398157560   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab   2.25.56006540967197077610238991327864082702   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  2.25.298436961669509509569879822879236656638   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  2.25.139183506679367140539825912154983541585   \n",
       "\n",
       "                                                524320    524321      524322  \\\n",
       "dicom_id                                                                       \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  21800506  21800506  21800506.0   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  21800506  21800506  21800506.0   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  21800626  21800626  21800626.0   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  21800626  21800626  21800626.0   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  21800723  21800723  21800723.0   \n",
       "\n",
       "                                                524323      524336  \\\n",
       "dicom_id                                                             \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  21800506  213014.531   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  21800506  213014.531   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  21800626  165500.312   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  21800626  165500.312   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714  21800723   80556.875   \n",
       "\n",
       "                                                  524337  ...  1578288  \\\n",
       "dicom_id                                                  ...            \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014  213026.750  ...      NaN   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962  213133.484  ...      NaN   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab  165512.437  ...      NaN   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c  165558.968  ...      NaN   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714   80714.500  ...      NaN   \n",
       "\n",
       "                                              1610546  4194912 1577040  \\\n",
       "dicom_id                                                                 \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014      NaN      NaN     NaN   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962      NaN      NaN     NaN   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab      NaN      NaN     NaN   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c      NaN      NaN     NaN   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714      NaN      NaN     NaN   \n",
       "\n",
       "                                             1577236  1605968  4195073  \\\n",
       "dicom_id                                                                 \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014     NaN      NaN      NaN   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962     NaN      NaN      NaN   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab     NaN      NaN      NaN   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c     NaN      NaN      NaN   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714     NaN      NaN      NaN   \n",
       "\n",
       "                                              4195086  4198403  1577328  \n",
       "dicom_id                                                                 \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014      NaN      NaN      NaN  \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962      NaN      NaN      NaN  \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab      NaN      NaN      NaN  \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c      NaN      NaN      NaN  \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714      NaN      NaN      NaN  \n",
       "\n",
       "[5 rows x 150 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>subject_id</th>\n",
       "      <th>study_id</th>\n",
       "      <th>PerformedProcedureStepDescription</th>\n",
       "      <th>ViewPosition</th>\n",
       "      <th>Rows</th>\n",
       "      <th>Columns</th>\n",
       "      <th>StudyDate</th>\n",
       "      <th>StudyTime</th>\n",
       "      <th>AcquisitionDeviceProcessingDescription</th>\n",
       "      <th>ProcedureCodeSequence_CodeMeaning</th>\n",
       "      <th>ViewCodeSequence_CodeMeaning</th>\n",
       "      <th>PatientOrientationCodeSequence_CodeMeaning</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dicom_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>02aa804e-bde0afdd-112c0b34-7bc16630-4e384014</th>\n",
       "      <td>10000032</td>\n",
       "      <td>50414267</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>PA</td>\n",
       "      <td>3056</td>\n",
       "      <td>2544</td>\n",
       "      <td>21800506</td>\n",
       "      <td>213014.531</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>postero-anterior</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962</th>\n",
       "      <td>10000032</td>\n",
       "      <td>50414267</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>LATERAL</td>\n",
       "      <td>3056</td>\n",
       "      <td>2544</td>\n",
       "      <td>21800506</td>\n",
       "      <td>213014.531</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>lateral</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab</th>\n",
       "      <td>10000032</td>\n",
       "      <td>53189527</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>PA</td>\n",
       "      <td>3056</td>\n",
       "      <td>2544</td>\n",
       "      <td>21800626</td>\n",
       "      <td>165500.312</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>postero-anterior</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c</th>\n",
       "      <td>10000032</td>\n",
       "      <td>53189527</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>LATERAL</td>\n",
       "      <td>3056</td>\n",
       "      <td>2544</td>\n",
       "      <td>21800626</td>\n",
       "      <td>165500.312</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CHEST (PA AND LAT)</td>\n",
       "      <td>lateral</td>\n",
       "      <td>Erect</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714</th>\n",
       "      <td>10000032</td>\n",
       "      <td>53911762</td>\n",
       "      <td>CHEST (PORTABLE AP)</td>\n",
       "      <td>AP</td>\n",
       "      <td>2705</td>\n",
       "      <td>2539</td>\n",
       "      <td>21800723</td>\n",
       "      <td>80556.875</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CHEST (PORTABLE AP)</td>\n",
       "      <td>antero-posterior</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              subject_id  study_id  \\\n",
       "dicom_id                                                             \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014    10000032  50414267   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962    10000032  50414267   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab    10000032  53189527   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c    10000032  53189527   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714    10000032  53911762   \n",
       "\n",
       "                                             PerformedProcedureStepDescription  \\\n",
       "dicom_id                                                                         \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                CHEST (PA AND LAT)   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                CHEST (PA AND LAT)   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                CHEST (PA AND LAT)   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                CHEST (PA AND LAT)   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714               CHEST (PORTABLE AP)   \n",
       "\n",
       "                                             ViewPosition  Rows  Columns  \\\n",
       "dicom_id                                                                   \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014           PA  3056     2544   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962      LATERAL  3056     2544   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab           PA  3056     2544   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c      LATERAL  3056     2544   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714           AP  2705     2539   \n",
       "\n",
       "                                              StudyDate   StudyTime  \\\n",
       "dicom_id                                                              \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014   21800506  213014.531   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962   21800506  213014.531   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab   21800626  165500.312   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c   21800626  165500.312   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714   21800723   80556.875   \n",
       "\n",
       "                                             AcquisitionDeviceProcessingDescription  \\\n",
       "dicom_id                                                                              \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                                    NaN   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                                    NaN   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                                    NaN   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                                    NaN   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714                                    NaN   \n",
       "\n",
       "                                             ProcedureCodeSequence_CodeMeaning  \\\n",
       "dicom_id                                                                         \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                CHEST (PA AND LAT)   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                CHEST (PA AND LAT)   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                CHEST (PA AND LAT)   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                CHEST (PA AND LAT)   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714               CHEST (PORTABLE AP)   \n",
       "\n",
       "                                             ViewCodeSequence_CodeMeaning  \\\n",
       "dicom_id                                                                    \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014             postero-anterior   \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                      lateral   \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab             postero-anterior   \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                      lateral   \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714             antero-posterior   \n",
       "\n",
       "                                             PatientOrientationCodeSequence_CodeMeaning  \n",
       "dicom_id                                                                                 \n",
       "02aa804e-bde0afdd-112c0b34-7bc16630-4e384014                                      Erect  \n",
       "174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962                                      Erect  \n",
       "2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab                                      Erect  \n",
       "e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c                                      Erect  \n",
       "68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714                                       None  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load in MIMIC-CXR 2.0.0 record list\n",
    "records =  pd.read_csv(mimic_cxr_path / 'cxr-record-list.csv.gz')\n",
    "records.set_index('dicom_id', inplace=True)\n",
    "\n",
    "# load in a CSV of meta-data derived from MIMIC-CXR\n",
    "metadata = pd.read_csv('dicom-metadata.csv.gz', index_col=0)\n",
    "metadata.index.name = 'dicom_id'\n",
    "\n",
    "# subselect to useful metadata\n",
    "metadata = metadata[['4194900', '1593601', '2621456', '2621457', '524320', '524336', '1577984']]\n",
    "\n",
    "# rename columns to be human readable\n",
    "metadata.columns = [DicomDictionary[int(c)][-1] for c in metadata.columns]\n",
    "\n",
    "# merge into records\n",
    "metadata = records[['subject_id', 'study_id']].merge(\n",
    "    metadata, how='left', left_index=True, right_index=True\n",
    ")\n",
    "\n",
    "# add in the metadata from the JSON file\n",
    "metadata = metadata.merge(\n",
    "    dcm_metadata_simple, how='left', left_index=True, right_index=True\n",
    ")\n",
    "metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata.sort_values(['subject_id', 'study_id'], inplace=True)\n",
    "metadata.to_csv('mimic-cxr-2.0.0-metadata.csv.gz', index=True, compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mimic-cxr",
   "language": "python",
   "name": "mimic-cxr"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
